*Project title: Anime Recommendation System
*Repository: {replace this with your git repository link}
*Team member(s): Chen, Wan Qi (wac45@pitt.edu), Clarchick, Victoria (vlc24@pitt.edu), Gupta, Abhibha (abg96@pitt.edu)
Abstract: This is a reflection on the creation of a recommendation system using R. The purpose of the recommendation system is to suggest anime to different users. The dataset was first explored through various plots (e.g., a correlation plot), and the most important features were taken into consideration for the recommendation system. The different variables were also evaluated to determine how they influence one another. Each variable was run through an extraction and frequency function and then analyzed. After the data exploration, the dataset was run through the recommendation system. We used the ‘hybrid’ approach, which weighs different methods based on popularity, randomness and recommendation. The results for our method are provided in the subsequent sections.
library(dplyr)
library(ggplot2)
library(tidyverse)
library(GGally)
library(caret)
library(forcats)
library(recommenderlab)
library(data.table)
library(reshape2)
library(maditr)
# library(ggstatsplot) #for correlation in task 5
# Load the three raw CSV exports once. The aliases keep both naming schemes
# used later in the file valid: df_* in the exploration code and *_data in
# the recommendation-system code.
csv_Rating <- read.csv(file = "/Users/abhibhagupta/Desktop/rating_complete.csv")
csv_Anime <- read.csv(file = "/Users/abhibhagupta/Desktop/anime.csv")
csv_Genre <- read.csv(file = "/Users/abhibhagupta/Desktop/anime_genre_year3.csv")
df_rating <- rating_data <- csv_Rating
df_anime <- anime_data <- csv_Anime
df_genre <- csv_Genre
#Data Exploration
##This is for preprocessing the data
#Read data
##df_anime <- read.csv(file = "C:/Users/set4s/Documents/Data Mining/Data Files/anime.csv")
#df_anime
##df_rating <- read.csv(file = "C:/Users/set4s/Documents/Data Mining/Data Files/rating_complete.csv")
#removing unknowns
###summary(df_rating)
#removing unknowns
# Coerce the columns needed for ranking and joining. Values that cannot be
# parsed as numbers become NA (with a coercion warning) and the affected rows
# are removed by na.omit() below.
df_anime$Ranked <- as.integer(df_anime$Ranked)
# Score is fractional (e.g. 8.78), so it must stay numeric: as.integer()
# would silently truncate it to 8.
df_anime$Score <- as.numeric(df_anime$Score)
df_anime$MAL_ID <- as.integer(df_anime$MAL_ID)
df_anime_clean <- na.omit(as.data.frame(df_anime))
# The first column is the anime id; rename it to match the ratings join key.
colnames(df_anime_clean)[1] <- "anime_id"
df_rating_clean <- na.omit(as.data.frame(df_rating))
df_combine <- left_join(df_rating_clean, df_anime_clean, by = "anime_id")
# Keep the contiguous column range user_id .. Plan.to.Watch and drop ratings
# whose anime had no complete metadata row.
df_com_clean <- df_combine %>%
  select("user_id":"Plan.to.Watch") %>%
  as.data.frame() %>%
  na.omit()
# The third column is the per-user rating taken from the ratings table.
colnames(df_com_clean)[3] <- "user_rating"
# Rank-limited subsets. The comparisons are strict, so rank 1000 / 20 / 100
# itself is excluded from the respective subset.
df_com_clean_1000 <- as.data.frame(df_com_clean[df_com_clean$Ranked < 1000, ])
df_com_clean_20 <- as.data.frame(df_com_clean[df_com_clean$Ranked < 20, ])
df_com_clean_100 <- as.data.frame(df_com_clean[df_com_clean$Ranked < 100, ])
# Panel function for the lower triangle of a ggpairs() matrix.
#   data, mapping: forwarded unchanged to ggplot().
#   method: smoothing method passed to geom_smooth() (default "lm").
# Returns a ggplot object: scatter points with a fitted trend line.
lowerFn <- function(data, mapping, method = "lm") {
  canvas <- ggplot(data = data, mapping = mapping)
  scatter <- geom_point(colour = "slategray4", size = 2, shape = 18)
  trend <- geom_smooth(method = method, color = "coral3")
  tilted_x_text <- theme(axis.text.x = element_text(size = 8, angle = 45))
  canvas + scatter + trend + theme_minimal() + tilted_x_text
}
# Pairwise scatter/correlation matrix of the numeric popularity metrics for
# the first 1000 anime when ordered by rank.
df_anime_clean %>%
  arrange(Ranked) %>%
  slice(1:1000) %>%
  select(Popularity, Favorites, Score, Ranked, Watching, Members) %>%
  mutate_all(as.numeric) %>%
  ggpairs(
    upper = list(continuous = wrap("cor", size = 5)),
    diag = list(
      continuous = wrap("barDiag", fill = 'skyblue1', colour = "skyblue4")
    ),
    lower = list(continuous = wrap(lowerFn, method = "lm")),
    cardinality_threshold = 50,
    progress = FALSE
  )
Proportionate: Popularity/Rank; Watching/Members; Score: Favorites, Members. Disproportionate: Rank: Favorites, Score, Members; Popularity: Members, Score. Important variables: Score and Rank.
###View Genres
#df_com_clean_1000%>% group_by(Genres) %>%
# summarize(count = n())
###Types of Genres
# Genre labels searched for in the Genres column.
genres <- c(
  "Action", "Adventure", "Cars", "Comedy", "Dementia", "Demons", "Drama",
  "Ecchi", "Fantasy", "Game", "Harem", "Historical", "Horror", "Josei",
  "Kids", "Magic", "Martial Arts", "Mecha", "Military", "Music", "Mystery",
  "Parody", "Police", "Psychological", "Romance", "Samurai", "School",
  "Sci-Fi", "Seinen", "Shoujo", "Shounen", "Slice of Life", "Space",
  "Sports", "Super Power", "Supernatural", "Thriller", "Vampire"
)
# For every genre, count the rating rows (rank-limited subset) whose Genres
# string contains that label.
count_rows_with_genre <- function(label) {
  sum(str_detect(df_com_clean_1000$Genres, label))
}
genres_df <- data.frame(
  Genres = genres,
  Count = vapply(genres, count_rows_with_genre, integer(1))
)
# Most frequent genres first; keep the top 10 and the top 5.
genres_df_10 <- genres_df %>%
  arrange(desc(Count)) %>%
  slice(1:10)
genres_df_5 <- genres_df %>%
  arrange(desc(Count)) %>%
  slice(1:5)
###Plot the Genres
# Horizontal bar chart: number of rating rows per genre (all genres).
ggplot(genres_df, aes(x = Count, y = Genres)) +
  geom_col(width = 0.6, fill = "#8888ff") +
  labs(
    title = "Count of Animes within Genres",
    x = "Count of Animes",
    y = "Genres"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Horizontal bar chart of the 10 most frequent genres, ordered by count and
# labelled with the count.
ggplot(genres_df_10, aes(x = Count, y = fct_reorder(Genres, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Top 10 Count of Animes within Genres",
    x = "Count of Animes",
    y = "Genres"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating over the rows whose Genres string contains each label.
mean_rating_for_genre <- function(label) {
  has_genre <- str_detect(df_com_clean_1000$Genres, label)
  mean(df_com_clean_1000[has_genre, ]$user_rating)
}
genres_df_2 <- data.frame(
  Genres = genres,
  Rating = vapply(genres, mean_rating_for_genre, numeric(1))
)
# Despite the "_5" suffix this keeps the 10 best-rated genres.
genres_df_2_5 <- genres_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:10)
# Horizontal bar chart: mean user rating per genre (all genres).
ggplot(genres_df_2, aes(x = Rating, y = Genres)) +
  geom_col(width = 0.6, fill = "#8888ff") +
  labs(
    title = "Average Ratings for Genre",
    x = "Average ratings",
    y = "Genres"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Horizontal bar chart of the 10 best-rated genres, ordered by mean rating
# and labelled with that mean.
ggplot(genres_df_2_5, aes(x = Rating, y = fct_reorder(Genres, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Top 10 Average Ratings for Genre",
    x = "Average ratings",
    y = "Genres"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
The first pair of plots shows how many users watched each genre; the second pair shows the average rating for each genre. Only 1 genre overlaps within the two top-10 lists, which shows that the most-watched genres are not highly rated.
# Rating count
#df_com_clean_1000 %>%
# group_by(user_rating) %>%
# summarize(count = n())
# Bar chart of how often each rating value occurs in the full joined data.
df_com_clean %>%
  group_by(user_rating) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = user_rating, y = count)) +
  geom_col(fill = "#8888ff") +
  geom_text(aes(label = count, vjust = -0.2)) +
  scale_x_continuous(n.breaks = 10) +
  labs(
    title = "Rating Distribution",
    x = "Rating",
    y = "Occurrences Count"
  ) +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Bar chart of how often each rating value occurs in the rank-limited subset.
df_com_clean_1000 %>%
  group_by(user_rating) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = user_rating, y = count)) +
  geom_col(fill = "#8888ff") +
  geom_text(aes(label = count, vjust = -0.2)) +
  scale_x_continuous(n.breaks = 10) +
  labs(
    title = "Rating Distribution",
    x = "Rating",
    y = "Occurrences Count"
  ) +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
Out of the top 1000 ranked anime, most have been rated by approximately 25,000 users, so the average rating is well distributed.
#df_com_clean_1000 %>%
# group_by(anime_id) %>%
# summarize(count = n()) %>%
# slice_head(n = 10)
# Scatter plot of the number of ratings each anime received against its id,
# with a smoothed trend line.
df_com_clean_1000 %>%
  group_by(anime_id) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = anime_id, y = count)) +
  geom_point(alpha = 0.2, color = "#4020dd") +
  geom_smooth(color = "red") +
  scale_x_continuous(n.breaks = 10) +
  labs(
    title = "Number of Ratings per anime",
    x = "anime id",
    y = "Number of ratings"
  ) +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Histogram (log10 x axis) of how many ratings each anime received.
# The y axis is the number of anime falling in each bin, so it is labelled
# as a count rather than as an anime id.
df_com_clean_1000 %>%
  group_by(anime_id) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = count)) +
  geom_histogram(fill = "#8888ff", color = "#4020dd") +
  scale_x_log10(n.breaks = 10) +
  labs(
    title = "anime' rating histogram",
    x = "Number of ratings",
    y = "Number of anime"
  ) +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Histogram (log10 x axis) of how many ratings each user submitted.
df_com_clean_1000 %>%
  group_by(user_id) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = count)) +
  geom_histogram(fill = "#8888ff", color = "#4020dd") +
  scale_x_log10(n.breaks = 10) +
  labs(
    title = "Users' rating histogram",
    x = "Rating count",
    y = "Number of Users that Rated"
  ) +
  theme(
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Heat map of a random sample of users x anime ratings.
limit <- 50
# One row per sampled user, one column per anime; cells hold the rating
# (NA where the user did not rate that anime).
sampled_users <- sample(unique(df_com_clean_100$user_id), limit)
ratings_wide <- df_com_clean_100 %>%
  filter(user_id %in% sampled_users) %>%
  select(user_id, anime_id, user_rating) %>%
  spread(anime_id, user_rating) %>%
  # Drop the id column so it can never be sampled as if it were a rating
  # column (its values would swamp the colour scale).
  select(-user_id)
# Sample at most `limit` anime columns; fewer may be available.
n_anime <- min(limit, ncol(ratings_wide))
user_anime_matrix <- ratings_wide %>%
  select(sample(ncol(.), n_anime)) %>%
  as.matrix() %>%
  t()
# After the transpose, rows are anime (x axis) and columns are users (y axis).
anime_axis <- seq_len(nrow(user_anime_matrix))
user_axis <- seq_len(ncol(user_anime_matrix))
# Base graphics calls draw as a side effect and must be issued one after
# another; they cannot be combined with `+` the way ggplot2 layers are.
image(anime_axis, user_axis, user_anime_matrix, xlab = "Anime", ylab = "User")
abline(
  h = 0:length(user_axis) + 0.5,
  v = 0:length(anime_axis) + 0.5,
  col = "grey"
)
title(main = list("User x Anime matrix w/ Rating", cex = 1, font = 2))
## integer(0)
# Rating-row count per Studios string: keep the 20 largest (ties included by
# top_n) and sort them in ascending order of count.
top_studios <- df_com_clean_1000 %>%
  group_by(Studios) %>%
  summarise(Count = n()) %>%
  top_n(20, wt = Count) %>%
  arrange(Count)
#print(top_studios)
### Studio names to look for in the Studios column
studios <- c(
  "Manglobe", "CoMix Wave Films", "P.A. Works", "Toei Animation",
  "TMS Entertainment", "Brain's Base", "Studio Deen", "White Fox",
  "Wit Studio", "ufotable", "Studio Pierrot", "Studio Ghibli", "Shaft",
  "Sunrise", "Production I.G", "J.C.Staff", "A-1 Pictures",
  "Kyoto Animation", "Bones", "Madhouse"
)
### Count the rating rows whose Studios string contains each name
# fixed() makes the match literal: several names contain ".", which as a
# regular expression would match any character.
studios_df <- data.frame(
  Studios = studios,
  Count = vapply(
    studios,
    function(studio) {
      sum(str_detect(df_com_clean_1000$Studios, fixed(studio)))
    },
    integer(1)
  )
)
###Plot the Studios
# The 10 studios with the most rating rows, drawn as a labelled horizontal
# bar chart ordered by count.
studios_df_10 <- studios_df %>%
  arrange(desc(Count)) %>%
  slice(1:10)
ggplot(studios_df_10, aes(x = Count, y = fct_reorder(Studios, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Top 10 Studios Based on the Number of Users that Rated",
    x = "Number of ratings",
    y = "Studios"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating over the rows whose Studios string contains each name.
# fixed() makes the match literal: several names contain ".", which as a
# regular expression would match any character.
studios_df_2 <- data.frame(
  Studios = studios,
  Rating = vapply(
    studios,
    function(studio) {
      from_studio <- str_detect(df_com_clean_1000$Studios, fixed(studio))
      mean(df_com_clean_1000[from_studio, ]$user_rating)
    },
    numeric(1)
  )
)
#print(studios_df_2)
# Best-rated studios first; keep the top 10 and the top 5.
studios_df_2_10 <- studios_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:10)
studios_df_2_5 <- studios_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:5)
# Horizontal bar chart of the 10 best-rated studios, ordered by mean rating
# and labelled with that mean.
ggplot(studios_df_2_10, aes(x = Rating, y = fct_reorder(Studios, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Top 10 Studios by Average Rating",
    x = "Average ratings",
    y = "Studios"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
4 studios overlap between the two lists. The higher-rated studios are watched less, showing that studios that mass produce do not have high-quality ratings.
# Rating-row count per distinct Producers string, sorted alphabetically by
# that string.
top_Producers <- df_com_clean_100 %>%
  group_by(Producers) %>%
  summarise(Count = n()) %>%
  arrange(Producers)
###Types of Producers
# Producer names to look for in the Producers column.
# "Warner Bros. Japan" must be a single-line literal: a line break inside
# the quotes becomes part of the string and the name can then never match.
producers <- c(
  "A-1 Pictures", "ABC Animation", "Amuse", "Animation Do", "Animax",
  "Aniplex", "Asatsu DK", "Ashi Production", "Asmik Ace", "AT-X",
  "Audio Highs", "Avex Pictures", "Bandai", "Bandai Namco Entertainment",
  "Bandai Visual", "Banpresto", "BS Fuji", "BS11", "CA-Cygames Anime Fund",
  "Crunchyroll SC Anime Fund", "Cyclone Graphics", "Cygames",
  "DAX Production", "Delfi Sound", "Dentsu", "Docomo Anime Store", "dugout",
  "FBC", "Fuji TV", "Fujipacific Music", "Global Solutions",
  "Good Smile Company", "Hakuhodo DY Media Partners",
  "Hakuhodo DY Music & Pictures", "Hakusensha", "Half H.P Studio",
  "Hiroshima Television", "JR East Marketing & Communications", "Kadokawa",
  "Kadokawa Media House", "Kadokawa Pictures Japan", "Kadokawa Shoten",
  "Kanetsu Investment", "Kansai Telecasting", "K-Factory", "King Records",
  "KlockWorx", "Kodansha", "Konami", "Kyoraku Industrial Holdings", "Lantis",
  "Mag Garden", "Mainichi Broadcasting System", "Marvelous", "Media Factory",
  "Medicos Entertainment", "Miracle Bus", "Miracle Robo", "Mirai-Kojo",
  "Movic", "My Theater D.D.", "Nagoya Broadcasting Network", "NewGin", "NHK",
  "Nihon Ad Systems", "Nippon Television Music", "Nippon Television Network",
  "Nitroplus", "Notes", "Pony Canyon", "Pony Canyon Enterprise", "Quaras",
  "Rakuonsha", "Sakura Create", "Science SARU", "Seikaisha", "Shochiku",
  "Shogakukan", "Shogakukan-Shueisha Productions", "Shueisha",
  "SKY Perfect Well Think", "Sony Music Communications",
  "Sony Music Entertainment", "Sound Team Don Juan", "Square Enix",
  "Studio Hibari", "Studio Jack", "Studio Moriken", "Takeshobo", "TAP", "TBS",
  "TC Entertainment", "Techno Sound", "Toho", "TOHO animation",
  "Tohokushinsha Film Corporation", "Tokuma Shoten", "Tokyo MX",
  "Toy's Factory", "Trinity Sound", "Twin Engine", "VAP",
  "Victor Entertainment", "voque ting", "Warner Bros. Japan",
  "Yomiuri Telecasting", "YTV", "Annapuru", "Avex Entertainment", "d-rights",
  "Frontier Works", "Kitty Films", "Madhouse", "Production I.G",
  "Starchild Records", "TMS Music", "TV Asahi", "TV Tokyo", "Unknown"
)
# Count the rating rows whose Producers string contains each name.
# fixed() makes the match literal, so "." in a name is not a regex wildcard.
# NOTE(review): this is still a substring match, so a short name such as
# "Bandai" also counts rows that only list "Bandai Visual" - confirm whether
# whole-entry matching is wanted.
producers_df <- data.frame(
  Producers = producers,
  Count = vapply(
    producers,
    function(producer) {
      sum(str_detect(df_com_clean_100$Producers, fixed(producer)))
    },
    integer(1)
  )
)
###Plot the Producers
# The 20 producers with the most rating rows (the "_10" suffix is
# historical), drawn as a labelled horizontal bar chart ordered by count.
producers_df_10 <- producers_df %>%
  arrange(desc(Count)) %>%
  slice(1:20)
ggplot(producers_df_10, aes(x = Count, y = fct_reorder(Producers, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Top 20 Producers Based on the Number of Users that Rated",
    x = "Number of ratings",
    y = "Producers"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating over the rows whose Producers string contains each name.
# fixed() makes the match literal, so "." in a name is not a regex wildcard.
producers_df_2 <- data.frame(
  Producers = producers,
  Rating = vapply(
    producers,
    function(producer) {
      by_producer <- str_detect(df_com_clean_100$Producers, fixed(producer))
      mean(df_com_clean_100[by_producer, ]$user_rating)
    },
    numeric(1)
  )
)
# Best-rated producers first. The "_10" table keeps 20 rows (the suffix is
# historical); the "_5" table keeps 5.
producers_df_2_10 <- producers_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:20)
producers_df_2_5 <- producers_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:5)
# Horizontal bar chart of the 20 best-rated producers, ordered by mean
# rating and labelled with that mean.
ggplot(
  producers_df_2_10,
  aes(x = Rating, y = fct_reorder(Producers, Rating))
) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Top 20 Producers by Average Rating",
    x = "Average ratings",
    y = "Producers"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
This category had to be expanded out to the top 20 to find 1 common producer. As with studios, this indicates that there are many producers, but between the most watched and the most highly ranked there are almost no commonalities.
# Rating-row count per distinct Licensors string, sorted alphabetically by
# that string.
top_Licensors <- df_com_clean_100 %>%
  group_by(Licensors) %>%
  summarise(Count = n()) %>%
  arrange(Licensors)
###Types of Licensors
# Licensor names to look for in the Licensors column.
# NOTE(review): "Inc." looks like a fragment of a longer company name rather
# than a licensor of its own - confirm against the data.
Licensors <- c(
  "4Kids Entertainment", "Aniplex of America", "Bandai Entertainment",
  "Central Park Media", "Crunchyroll", "Geneon Entertainment USA", "GKIDS",
  "Inc.", "Manga Entertainment", "NYAV Post", "Sentai Filmworks", "Tokyopop",
  "ADV Films", "Discotek Media", "Eleven Arts", "Flatiron Film Company",
  "Funimation", "NIS America", "Nozomi Entertainment", "Unknown",
  "VIZ Media", "Walt Disney Studios"
)
# Count the rating rows whose Licensors string contains each name.
# fixed() makes the match literal; as a regular expression "Inc." would
# match "Inc" followed by any character.
Licensors_df <- data.frame(
  Licensors = Licensors,
  Count = vapply(
    Licensors,
    function(licensor) {
      sum(str_detect(df_com_clean_1000$Licensors, fixed(licensor)))
    },
    integer(1)
  )
)
###Plot the Licensors
# Licensors with the most rating rows (top 10 and top 5); the top 10 are
# drawn as a labelled horizontal bar chart ordered by count.
Licensors_df_10 <- Licensors_df %>%
  arrange(desc(Count)) %>%
  slice(1:10)
Licensors_df_5 <- Licensors_df %>%
  arrange(desc(Count)) %>%
  slice(1:5)
ggplot(Licensors_df_10, aes(x = Count, y = fct_reorder(Licensors, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Top 10 Licensors Based on the Number of Users that Rated",
    x = "Number of ratings",
    y = "Licensors"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating over the rows whose Licensors string contains each name.
# fixed() makes the match literal; as a regular expression "Inc." would
# match "Inc" followed by any character.
Licensors_df_2 <- data.frame(
  Licensors = Licensors,
  Rating = vapply(
    Licensors,
    function(licensor) {
      by_licensor <- str_detect(df_com_clean_1000$Licensors, fixed(licensor))
      mean(df_com_clean_1000[by_licensor, ]$user_rating)
    },
    numeric(1)
  )
)
# Best-rated licensors first; keep the top 10 and the top 5.
Licensors_df_2_10 <- Licensors_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:10)
Licensors_df_2_5 <- Licensors_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:5)
# Horizontal bar chart of the 10 best-rated licensors, ordered by mean
# rating and labelled with that mean.
ggplot(
  Licensors_df_2_10,
  aes(x = Rating, y = fct_reorder(Licensors, Rating))
) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Top 10 Licensors by Average Rating",
    x = "Average ratings",
    y = "Licensors"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
Similar to studios and producers, mass licensing is equivalent to less desirable ratings.
# Rating-row count per Source in the rank-limited subset, as a plain
# data frame.
Source_df <- df_com_clean_1000 %>%
  group_by(Source) %>%
  summarise(Count = n()) %>%
  as.data.frame()
### Ten sources with the most rating rows (input for the plot)
Source_df_10 <- Source_df %>%
  arrange(desc(Count)) %>%
  slice(1:10)
# Horizontal bar chart of the 10 sources with the most rating rows
# (rank-limited subset), ordered by count.
ggplot(Source_df_10, aes(x = Count, y = fct_reorder(Source, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Top 10 Sources VS Number of Users that Rated",
    x = "Number of ratings",
    y = "Source"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating per Source in the rank-limited subset, and its 10
# best-rated sources.
Source_df_2 <- df_com_clean_1000 %>%
  group_by(Source) %>%
  summarise(Rating = mean(user_rating))
Source_df_2_10 <- Source_df_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:10)
# Horizontal bar chart of the 10 best-rated sources (rank-limited subset),
# ordered by mean rating.
ggplot(Source_df_2_10, aes(x = Rating, y = fct_reorder(Source, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Top 10 Sources by Average Rating",
    x = "Average ratings",
    y = "Source"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Rating-row count per Source over the full joined data, as a plain
# data frame.
Source_df2 <- df_com_clean %>%
  group_by(Source) %>%
  summarise(Count = n()) %>%
  as.data.frame()
### Ten sources with the most rating rows (input for the plot)
Source_df2_10 <- Source_df2 %>%
  arrange(desc(Count)) %>%
  slice(1:10)
# Horizontal bar chart of the 10 sources with the most rating rows
# (full data), ordered by count.
ggplot(Source_df2_10, aes(x = Count, y = fct_reorder(Source, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Top 10 Sources VS Number of Users that Rated",
    x = "Number of ratings",
    y = "Source"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating per Source over the full joined data (plain data frame),
# and its 10 best-rated sources.
Source_df2_2 <- df_com_clean %>%
  group_by(Source) %>%
  summarise(Rating = mean(user_rating)) %>%
  as.data.frame()
Source_df2_2_10 <- Source_df2_2 %>%
  arrange(desc(Rating)) %>%
  slice(1:10)
# Horizontal bar chart of the 10 best-rated sources (full data), ordered by
# mean rating.
ggplot(Source_df2_2_10, aes(x = Rating, y = fct_reorder(Source, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Top 10 Sources by Average Rating",
    x = "Average ratings",
    y = "Source"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
Ranked 1-1000: 3 common sources. Entire dataset: 6 common sources. The sources that are most watched are not among the highest rated, yet the less-watched sources are rated much higher.
This shows that the sources that are watched most often do not have high ratings.
# Rating-row count per Type in the rank-limited subset, as a plain
# data frame.
Type_df <- df_com_clean_1000 %>%
  group_by(Type) %>%
  summarise(Count = n()) %>%
  as.data.frame()
###Plot the Type
# Horizontal bar chart of rating rows per Type (rank-limited subset),
# ordered by count.
ggplot(Type_df, aes(x = Count, y = fct_reorder(Type, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Type VS Number of Users that Rated",
    x = "Number of ratings",
    y = "Type"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating per Type in the rank-limited subset.
Type_df_2 <- df_com_clean_1000 %>%
  group_by(Type) %>%
  summarise(Rating = mean(user_rating))
# Horizontal bar chart of mean rating per Type (rank-limited subset),
# ordered by mean rating.
ggplot(Type_df_2, aes(x = Rating, y = fct_reorder(Type, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Type by Average Rating",
    x = "Average ratings",
    y = "Type"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Rating-row count per Type over the full joined data, as a plain
# data frame.
Type_df2 <- df_com_clean %>%
  group_by(Type) %>%
  summarise(Count = n()) %>%
  as.data.frame()
###Plot the Type
# Horizontal bar chart of rating rows per Type (full data), ordered by count.
ggplot(Type_df2, aes(x = Count, y = fct_reorder(Type, Count))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Count), vjust = -0.2) +
  labs(
    title = "Type VS Number of Users that Rated",
    x = "Number of ratings",
    y = "Type"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
# Mean user rating per Type over the full joined data.
Type_df2_2 <- df_com_clean %>%
  group_by(Type) %>%
  summarise(Rating = mean(user_rating))
# Horizontal bar chart of mean rating per Type (full data), ordered by mean
# rating.
ggplot(Type_df2_2, aes(x = Rating, y = fct_reorder(Type, Rating))) +
  geom_col(width = 0.6, fill = "#8888ff") +
  geom_text(aes(label = Rating), vjust = -0.2) +
  labs(
    title = "Type by Average Rating",
    x = "Average ratings",
    y = "Type"
  ) +
  theme(
    plot.title = element_text(vjust = 3.5),
    axis.title.x = element_text(vjust = -5, face = "bold"),
    axis.title.y = element_text(vjust = 10, face = "bold"),
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 0),
    axis.text.y = element_text(vjust = 0.25, hjust = 1, size = 12),
    plot.margin = margin(0.7, 0.5, 1, 1.2, "cm")
  )
Ranked 1-1000: the top 2 types are the same for both most rated and highest rated. Entire dataset: the top 2 are the only changes. Users watch TV much more than Movies, but the ratings are almost the same.
##Visualization The distribution of the all anime
# Keep only the anime whose Genres string does not contain "Unknown".
df_anime_remove <- csv_Anime[!grepl("Unknown", csv_Anime$Genres), ]
#df_anime_remove
# Make sure Score and Ranked are numeric before modelling and plotting.
for (metric in c("Score", "Ranked")) {
  df_anime_remove[[metric]] <- as.numeric(df_anime_remove[[metric]])
}
rm(metric)
typeof(df_anime_remove$Ranked)
## [1] "double"
#Relationships between score and popularity
#linear regression
# Linear model of Score on Popularity. Passing `data =` keeps the terms
# named after the columns instead of after `df$col` expressions.
lm_score_prop <- lm(Score ~ Popularity, data = df_anime_remove)
#Visualization
# aes() refers to bare column names: using `df$col` inside aes() bypasses
# the plot's data (ggplot2 warns about it) and puts the full expression on
# the axis labels.
ggplot(df_anime_remove, aes(x = Popularity, y = Score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
ggplot(df_anime_remove, aes(x = Ranked, y = Score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# ggscatterstats(data = df_anime_remove, x = df_anime_remove$Popularity, y = df_anime_remove$Score)
# Histogram of anime scores in 0.1-wide bins, with a tick at every integer
# score from 0 to 10.
ggplot(df_anime_remove, aes(x = Score)) +
  geom_histogram(binwidth = 0.1, fill = "blue") +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  xlab("score") +
  ylab("number of anime")
Treemaps 1. Genre counts
# Split every Genres string on commas, strip double quotes and ALL spaces
# from each piece, and pool the pieces into one character vector.
genre <- gsub(
  "[\" ]", "",
  unlist(strsplit(df_anime_remove[["Genres"]], ","))
)
# Frequency table of the pooled genre labels.
counts <- as.data.frame(table(genre))
head(counts)
## genre Freq
## 1 Action 3888
## 2 Adventure 2957
## 3 Cars 133
## 4 Comedy 6029
## 5 Dementia 512
## 6 Demons 501
# The treemap() calls below come from the 'treemap' package:
#install.packages('treemap')
library(treemap)
# Show the first rows of the genre frequency table that feeds the treemap.
head(counts)
## genre Freq
## 1 Action 3888
## 2 Adventure 2957
## 3 Cars 133
## 4 Comedy 6029
## 5 Dementia 512
## 6 Demons 501
# Treemap of genres: tile area and colour both follow the frequency.
treemap(
  counts,
  index = c("genre", "Freq"),
  vSize = "Freq",
  vColor = "Freq",
  type = "value"
)
Preparing data #finding the type for each anime.
# Split every Type string on commas, strip double quotes and ALL spaces from
# each piece, and pool the pieces into one character vector.
type <- gsub(
  "[\" ]", "",
  unlist(strsplit(df_anime_remove[["Type"]], ","))
)
#finding counts for each type
type_counts <- as.data.frame(table(type))
print(type_counts)
## type Freq
## 1 Movie 2995
## 2 Music 1469
## 3 ONA 1900
## 4 OVA 3890
## 5 Special 2218
## 6 TV 4991
## 7 Unknown 36
#finding the source for each anime.
# Split every Source string on commas, strip double quotes and ALL spaces
# from each piece, and pool the pieces into one character vector.
source <- gsub(
  "[\" ]", "",
  unlist(strsplit(df_anime_remove[["Source"]], ","))
)
#finding counts for each source
source_counts <- as.data.frame(table(source))
head(source_counts)
## source Freq
## 1 4-komamanga 288
## 2 Book 112
## 3 Cardgame 64
## 4 Digitalmanga 15
## 5 Game 879
## 6 Lightnovel 768
# Treemap of anime types: tile area and colour both follow the frequency.
treemap(
  type_counts,
  index = c("type", "Freq"),
  vSize = "Freq",
  vColor = "Freq",
  type = "value"
)
# Same layout for the source material of each anime.
treemap(
  source_counts,
  index = c("source", "Freq"),
  vSize = "Freq",
  vColor = "Freq",
  type = "value"
)
Studios with the top 20 most anime #finding the studios for each anime.
# Split every Studios string on commas, strip double quotes and ALL spaces
# from each piece, and pool the pieces into one character vector.
# This replaces the earlier `studios` vector of studio names.
studios <- gsub(
  "[\" ]", "",
  unlist(strsplit(df_anime_remove[["Studios"]], ","))
)
#finding the counts for each studios
studios_counts <- as.data.frame(table(studios))
head(studios_counts)
## studios Freq
## 1 10Gauge 7
## 2 1IN 1
## 3 2:10AMAnimation 6
## 4 33Collective 1
## 5 3xCube 1
## 6 81Produce 1
# Drop the "Unknown" studio entries, then chart the 20 studios with the most
# anime as horizontal bars labelled with their counts.
studios_counts_remove <- studios_counts[
  !grepl("Unknown", studios_counts$studios),
]
studios_counts_remove %>%
  top_n(20, wt = Freq) %>%
  ggplot(aes(x = reorder(studios, Freq), y = Freq)) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = Freq), hjust = -0.1, size = 3) +
  coord_flip(ylim = c(0, 300)) +
  labs(x = "", y = "Number of animes")
genre vs year!!!
#install.packages("ggridges")
library(ggridges)
# Ridgeline plot: one density curve of release years per genre.
ggplot(df_genre, aes(x = year, y = genre)) +
  geom_density_ridges() +
  scale_x_continuous(breaks = seq(1960, 2022, by = 10)) +
  scale_point_color_hue(l = 40) +
  scale_fill_cyclical(values = c("blue", "green")) +
  xlab("year") +
  ylab("genre")
#Recommendation System
Reading the rating data
##rating_data<- read.csv("/Users/abhibhagupta/Desktop/grad_coursework/sem2/DataMining/data/rating_complete.csv")
head(rating_data)
## user_id anime_id rating
## 1 0 430 9
## 2 0 1004 5
## 3 0 3010 7
## 4 0 570 7
## 5 0 2762 9
## 6 0 431 8
dim(rating_data)
## [1] 57633278 3
Remove any missing values. -> The dataset doesn’t contain any missing values
# Keep only the rows in which no column is NA, then report the new size.
rating_data <- rating_data[complete.cases(rating_data), , drop = FALSE]
dim(rating_data)
## [1] 57633278 3
Reading the Anime data which contains information about all animes
##anime_data<- read.csv("/Users/abhibhagupta/Desktop/grad_coursework/sem2/DataMining/data/anime.csv")
head(anime_data)
## MAL_ID Name Score
## 1 1 Cowboy Bebop 8.78
## 2 5 Cowboy Bebop: Tengoku no Tobira 8.39
## 3 6 Trigun 8.24
## 4 7 Witch Hunter Robin 7.27
## 5 8 Bouken Ou Beet 6.98
## 6 15 Eyeshield 21 7.95
## Genres English.name
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 2 Action, Drama, Mystery, Sci-Fi, Space Cowboy Bebop:The Movie
## 3 Action, Sci-Fi, Adventure, Comedy, Drama, Shounen Trigun
## 4 Action, Mystery, Police, Supernatural, Drama, Magic Witch Hunter Robin
## 5 Adventure, Fantasy, Shounen, Supernatural Beet the Vandel Buster
## 6 Action, Sports, Comedy, Shounen Unknown
## Japanese.name Type Episodes
## 1 カウボーイビバップ TV 26
## 2 カウボーイビバップ 天国の扉 Movie 1
## 3 トライガン TV 26
## 4 Witch Hunter ROBIN (ウイッチハンターロビン) TV 26
## 5 冒険王ビィト TV 52
## 6 アイシールド21 TV 145
## Aired Premiered
## 1 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2 Sep 1, 2001 Unknown
## 3 Apr 1, 1998 to Sep 30, 1998 Spring 1998
## 4 Jul 2, 2002 to Dec 24, 2002 Summer 2002
## 5 Sep 30, 2004 to Sep 29, 2005 Fall 2004
## 6 Apr 6, 2005 to Mar 19, 2008 Spring 2005
## Producers
## 1 Bandai Visual
## 2 Sunrise, Bandai Visual
## 3 Victor Entertainment
## 4 TV Tokyo, Bandai Visual, Dentsu, Victor Entertainment
## 5 TV Tokyo, Dentsu
## 6 TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Shueisha
## Licensors Studios Source Duration
## 1 Funimation, Bandai Entertainment Sunrise Original 24 min. per ep.
## 2 Sony Pictures Entertainment Bones Original 1 hr. 55 min.
## 3 Funimation, Geneon Entertainment USA Madhouse Manga 24 min. per ep.
## 4 Funimation, Bandai Entertainment Sunrise Original 25 min. per ep.
## 5 Unknown Toei Animation Manga 23 min. per ep.
## 6 VIZ Media, Sentai Filmworks Gallop Manga 23 min. per ep.
## Rating Ranked Popularity Members Favorites Watching
## 1 R - 17+ (violence & profanity) 28.0 39 1251960 61971 105808
## 2 R - 17+ (violence & profanity) 159.0 518 273145 1174 4143
## 3 PG-13 - Teens 13 or older 266.0 201 558913 12944 29113
## 4 PG-13 - Teens 13 or older 2481.0 1467 94683 587 4300
## 5 PG - Children 3710.0 4369 13224 18 642
## 6 PG-13 - Teens 13 or older 604.0 1003 148259 2066 13907
## Completed On.Hold Dropped Plan.to.Watch Score.10 Score.9 Score.8 Score.7
## 1 718161 71513 26678 329800 229170.0 182126.0 131625.0 62330.0
## 2 208333 1935 770 57964 30043.0 49201.0 49505.0 22632.0
## 3 343492 25465 13925 146918 50229.0 75651.0 86142.0 49432.0
## 4 46165 5121 5378 33719 2182.0 4806.0 10128.0 11618.0
## 5 7314 766 1108 3394 312.0 529.0 1242.0 1713.0
## 6 78349 14228 11573 30202 9226.0 14904.0 22811.0 16734.0
## Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 2 5805.0 1877.0 577.0 221.0 109.0 379.0
## 3 15376.0 5838.0 1965.0 664.0 316.0 533.0
## 4 5709.0 2920.0 1083.0 353.0 164.0 131.0
## 5 1068.0 634.0 265.0 83.0 50.0 27.0
## 6 6206.0 2621.0 795.0 336.0 140.0 151.0
We are using a subset of the rating data, as the full dataset is too large to be processed on a personal computer.
# Work on the first 10,000 ratings only. head() is used instead of
# `[1:10000, ]` so that a shorter table is returned unchanged rather than
# being padded with all-NA rows.
rating_data <- head(rating_data, 10000)
dim(rating_data)
# Number of ratings contributed by each user in the subset.
table(rating_data$user_id)
##
## 0 1 2 3 4 5 6 7 8 10 11 12 13 14 15 16 17 18 19 20
## 35 103 51 315 118 43 311 87 18 4 161 60 25 96 9 247 709 41 679 79
## 21 22 23 24 25 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42
## 398 63 46 6 152 97 75 58 57 1 44 134 185 4 139 86 79 118 175 674
## 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 61 62 63
## 4 395 52 358 285 24 201 72 28 216 493 110 175 23 76 55 86 188 68 38
## 64 65 66 67 68 70 71 72 73
## 281 27 100 166 23 78 1 124 471
For our anime recommendation system, we use the recommenderlab package. We first have to convert the rating data into a sparse user-by-anime rating matrix.
# Reshape the long (user_id, anime_id, rating) table into a wide table with
# one row per user and one column per anime.
ratingMatrix <- dcast(rating_data, user_id ~ anime_id, value.var = "rating")
dim(ratingMatrix)
# Remember the user ids before the id column is dropped, and keep them as row
# names so that every row of the rating matrix can still be traced back to
# its user.
user_ids <- ratingMatrix[["user_id"]]
ratingMatrix <- as.matrix(ratingMatrix[,-1]) #remove user_ids
rownames(ratingMatrix) <- user_ids
# Convert to recommenderlab's sparse rating-matrix class.
ratingMatrix <- as(ratingMatrix, "realRatingMatrix")
str(ratingMatrix)
## Formal class 'realRatingMatrix' [package "recommenderlab"] with 2 slots
## ..@ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. .. ..@ i : int [1:10000] 3 6 13 18 21 22 41 50 54 58 ...
## .. .. ..@ p : int [1:3099] 0 11 17 22 23 24 25 28 31 34 ...
## .. .. ..@ Dim : int [1:2] 69 3098
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : chr [1:3098] "1" "5" "6" "7" ...
## .. .. ..@ x : num [1:10000] 9 6 9 8 9 9 9 10 9 10 ...
## .. .. ..@ factors : list()
## ..@ normalize: NULL
Important parameters that provide us various options for building recommendation systems
recommendation_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommendation_model)
## [1] "HYBRID_realRatingMatrix" "ALS_realRatingMatrix"
## [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"
## [5] "LIBMF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [7] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [9] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [11] "UBCF_realRatingMatrix"
# Show the description of every registered recommender algorithm.
lapply(recommendation_model, function(entry) entry[["description"]])
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
##
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
##
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
##
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
##
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
##
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
##
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
##
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
##
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
##
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
##
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
Inspect the default parameters of User-Based Collaborative Filtering (UBCF).
recommendation_model$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $weighted
## [1] TRUE
##
## $normalize
## [1] "center"
##
## $min_matching_items
## [1] 0
##
## $min_predictive_items
## [1] 0
Exploring Similar Data
Utilizing preferences gathered from numerous other users, collaborative filtering involves recommending animes to consumers. For instance, if user A and user B both enjoy comedy anime, then A will be recommended the anime that B will watch in the future, and vice versa. Therefore, establishing a commonality between the two consumers is necessary for making anime recommendations. We can calculate similarities using a variety of operators, including cosine, pearson, and jaccard, with the aid of recommenderlab.
# Pairwise cosine similarity between the first four users (rows) of the
# rating matrix, shown as a plain matrix.
similarity_mat <- similarity(
  ratingMatrix[1:4, ],
  method = "cosine",
  which = "users"
)
as.matrix(similarity_mat)
## 1 2 3 4
## 1 NA 1.0000000 NA 1.0000000
## 2 1 NA 0.9961698 0.9958885
## 3 NA 0.9961698 NA 0.9969529
## 4 1 0.9958885 0.9969529 NA
image(as.matrix(similarity_mat), main = "User's Similarities")
In the above matrix, each row and column represents a user. We have taken four users and each cell in this matrix represents the similarity that is shared between the two users.
Now, we delineate the similarity that is shared between anime
# Pairwise cosine similarity between the first four anime (columns) of the
# rating matrix, shown as a plain matrix.
anime_similarity <- similarity(
  ratingMatrix[, 1:4],
  method = "cosine",
  which = "items"
)
as.matrix(anime_similarity)
## 1 5 6 7
## 1 NA 0.9956802 0.9993698 1
## 5 0.9956802 NA 0.9913553 1
## 6 0.9993698 0.9913553 NA NA
## 7 1.0000000 1.0000000 NA NA
image(as.matrix(anime_similarity), main = "Anime similarity")
Extract the unique rating values
# Flatten the sparse matrix stored inside the rating matrix into one plain
# vector holding every cell, then list the distinct values it contains.
# NOTE(review): cells without a rating appear to come back as 0 here (the
# number of zeros in the table below equals the number of empty user/anime
# pairs) - confirm before treating 0 as a real rating.
rating_values <- as.vector(ratingMatrix@data)
unique(rating_values)
## [1] 0 9 6 8 10 7 5 4 3 1 2
Create a table that counts how often each rating value occurs. A value of 0 corresponds to user-anime pairs without a rating.
# Frequency of every value found in the flattened rating vector.
Table_of_Ratings <- table(rating_values) # count of each anime rating value
Table_of_Ratings
## rating_values
## 0 1 2 3 4 5 6 7 8 9 10
## 203762 41 40 109 201 552 1033 2240 2596 1758 1430
head(anime_data)
## MAL_ID Name Score
## 1 1 Cowboy Bebop 8.78
## 2 5 Cowboy Bebop: Tengoku no Tobira 8.39
## 3 6 Trigun 8.24
## 4 7 Witch Hunter Robin 7.27
## 5 8 Bouken Ou Beet 6.98
## 6 15 Eyeshield 21 7.95
## Genres English.name
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 2 Action, Drama, Mystery, Sci-Fi, Space Cowboy Bebop:The Movie
## 3 Action, Sci-Fi, Adventure, Comedy, Drama, Shounen Trigun
## 4 Action, Mystery, Police, Supernatural, Drama, Magic Witch Hunter Robin
## 5 Adventure, Fantasy, Shounen, Supernatural Beet the Vandel Buster
## 6 Action, Sports, Comedy, Shounen Unknown
## Japanese.name Type Episodes
## 1 カウボーイビバップ TV 26
## 2 カウボーイビバップ 天国の扉 Movie 1
## 3 トライガン TV 26
## 4 Witch Hunter ROBIN (ウイッチハンターロビン) TV 26
## 5 冒険王ビィト TV 52
## 6 アイシールド21 TV 145
## Aired Premiered
## 1 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2 Sep 1, 2001 Unknown
## 3 Apr 1, 1998 to Sep 30, 1998 Spring 1998
## 4 Jul 2, 2002 to Dec 24, 2002 Summer 2002
## 5 Sep 30, 2004 to Sep 29, 2005 Fall 2004
## 6 Apr 6, 2005 to Mar 19, 2008 Spring 2005
## Producers
## 1 Bandai Visual
## 2 Sunrise, Bandai Visual
## 3 Victor Entertainment
## 4 TV Tokyo, Bandai Visual, Dentsu, Victor Entertainment
## 5 TV Tokyo, Dentsu
## 6 TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Shueisha
## Licensors Studios Source Duration
## 1 Funimation, Bandai Entertainment Sunrise Original 24 min. per ep.
## 2 Sony Pictures Entertainment Bones Original 1 hr. 55 min.
## 3 Funimation, Geneon Entertainment USA Madhouse Manga 24 min. per ep.
## 4 Funimation, Bandai Entertainment Sunrise Original 25 min. per ep.
## 5 Unknown Toei Animation Manga 23 min. per ep.
## 6 VIZ Media, Sentai Filmworks Gallop Manga 23 min. per ep.
## Rating Ranked Popularity Members Favorites Watching
## 1 R - 17+ (violence & profanity) 28.0 39 1251960 61971 105808
## 2 R - 17+ (violence & profanity) 159.0 518 273145 1174 4143
## 3 PG-13 - Teens 13 or older 266.0 201 558913 12944 29113
## 4 PG-13 - Teens 13 or older 2481.0 1467 94683 587 4300
## 5 PG - Children 3710.0 4369 13224 18 642
## 6 PG-13 - Teens 13 or older 604.0 1003 148259 2066 13907
## Completed On.Hold Dropped Plan.to.Watch Score.10 Score.9 Score.8 Score.7
## 1 718161 71513 26678 329800 229170.0 182126.0 131625.0 62330.0
## 2 208333 1935 770 57964 30043.0 49201.0 49505.0 22632.0
## 3 343492 25465 13925 146918 50229.0 75651.0 86142.0 49432.0
## 4 46165 5121 5378 33719 2182.0 4806.0 10128.0 11618.0
## 5 7314 766 1108 3394 312.0 529.0 1242.0 1713.0
## 6 78349 14228 11573 30202 9226.0 14904.0 22811.0 16734.0
## Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 2 5805.0 1877.0 577.0 221.0 109.0 379.0
## 3 15376.0 5838.0 1965.0 664.0 316.0 533.0
## 4 5709.0 2920.0 1083.0 353.0 164.0 131.0
## 5 1068.0 634.0 265.0 83.0 50.0 27.0
## 6 6206.0 2621.0 795.0 336.0 140.0 151.0
Since there are many columns that we do not need, we use a subset of them.
# Narrow the anime table to the id, title, score and genre columns.
anime_data <- subset(anime_data, select = c(MAL_ID, Name, Score, Genres))
summary(anime_data)
## MAL_ID Name Score Genres
## Min. : 1 Length:17562 Length:17562 Length:17562
## 1st Qu.: 5954 Class :character Class :character Class :character
## Median :22820 Mode :character Mode :character Mode :character
## Mean :21477
## 3rd Qu.:35625
## Max. :48492
head(anime_data)
## MAL_ID Name Score
## 1 1 Cowboy Bebop 8.78
## 2 5 Cowboy Bebop: Tengoku no Tobira 8.39
## 3 6 Trigun 8.24
## 4 7 Witch Hunter Robin 7.27
## 5 8 Bouken Ou Beet 6.98
## 6 15 Eyeshield 21 7.95
## Genres
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space
## 2 Action, Drama, Mystery, Sci-Fi, Space
## 3 Action, Sci-Fi, Adventure, Comedy, Drama, Shounen
## 4 Action, Mystery, Police, Supernatural, Drama, Magic
## 5 Adventure, Fantasy, Shounen, Supernatural
## 6 Action, Sports, Comedy, Shounen
Most Viewed Anime Visualization In this section, we will explore the most viewed anime in our dataset. We first count the number of views (i.e. ratings) each anime received and then organize the counts in a table sorted in descending order.
library(ggplot2)
# Number of ratings recorded for each anime (one count per matrix column).
anime_views <- colCounts(ratingMatrix)
#print(anime_views)
# One row per anime id with its count, most-rated anime first.
table_views <- data.frame(anime = names(anime_views),
                          views = anime_views)
#table_views
table_views <- table_views[order(table_views$views, decreasing = TRUE), ]
# Attach the title of every anime in one vectorised lookup. Ids are compared
# as text because the matrix column names are character. match() yields NA
# for an id that is missing from anime_data, so an unknown id produces an NA
# title instead of aborting with a zero-length replacement error.
title_idx <- match(as.character(table_views$anime),
                   as.character(anime_data$MAL_ID))
table_views$title <- as.character(anime_data$Name[title_idx])
head(table_views, 6)
## anime views title
## 16498 16498 37 Shingeki no Kyojin
## 1535 1535 36 Death Note
## 199 199 34 Sen to Chihiro no Kamikakushi
## 4224 4224 33 Toradora!
## 11757 11757 32 Sword Art Online
## 5114 5114 31 Fullmetal Alchemist: Brotherhood
# Bar chart of the view counts for the first six rows of table_views, with
# the title labels rotated by 45 degrees.
ggplot(table_views[1:6, ], aes(x = title, y = views)) +
  geom_col(fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Total Views of the Top Animes")
The figure above shows the number of views for the top 6 most watched anime. The anime Shingeki No Kyojin has the highest number of views.
Performing Data Preparation We will conduct data preparation in the following three steps –
Selecting useful data. Normalizing data. Binarizing the data. To select useful data, we keep only users who have rated more than 5 anime and only anime that have been rated by more than 5 users. This filters out rarely rated anime and users with very few ratings.
# Keep users with more than 5 ratings (rows) and anime with more than 5
# ratings (columns), then print a summary of the reduced matrix.
anime_ratings <- ratingMatrix[
  rowCounts(ratingMatrix) > 5,
  colCounts(ratingMatrix) > 5
]
anime_ratings
## 64 x 493 rating matrix of class 'realRatingMatrix' with 5080 ratings.
# 98th percentile of ratings per user and of ratings per anime; only rows and
# columns above these cut-offs are drawn in the heatmap.
minimum_movies <- quantile(rowCounts(anime_ratings), probs = 0.98)
minimum_users <- quantile(colCounts(anime_ratings), probs = 0.98)
image(
  anime_ratings[
    rowCounts(anime_ratings) > minimum_movies,
    colCounts(anime_ratings) > minimum_users
  ],
  main = "Heatmap of the top users and movies"
)
# Mean rating given by each user (one value per row of the rating matrix).
average_ratings <- rowMeans(anime_ratings)
# Histogram of those per-user means. Written with ggplot() and
# geom_histogram() because qplot() is deprecated in current ggplot2; the
# fill and outline colours match the earlier qplot() call.
ggplot(data.frame(average_ratings = average_ratings),
       aes(x = average_ratings)) +
  geom_histogram(fill = "steelblue", colour = "red") +
  ggtitle("Distribution of the average rating per user")
# Normalize the ratings with recommenderlab's defaults and draw the same
# top-user / top-anime slice as the previous heatmap.
normalized_ratings <- normalize(anime_ratings)
image(
  normalized_ratings[
    rowCounts(normalized_ratings) > minimum_movies,
    colCounts(normalized_ratings) > minimum_users
  ],
  main = "Normalized Ratings of the Top Users"
)
# 95th-percentile cut-offs for ratings per user and ratings per anime.
binary_minimum_animes <- quantile(rowCounts(anime_ratings), probs = 0.95)
binary_minimum_users <- quantile(colCounts(anime_ratings), probs = 0.95)
#movies_watched <- binarize(movie_ratings, minRating = 1)
# Binarize the ratings with a minimum rating of 3, then plot the rows and
# columns that exceed the cut-offs above.
good_rated_animes <- binarize(anime_ratings, minRating = 3)
image(
  good_rated_animes[
    rowCounts(anime_ratings) > binary_minimum_animes,
    colCounts(anime_ratings) > binary_minimum_users
  ],
  main = "Heatmap of the top users and movies"
)
# Randomly flag every user (row of anime_ratings) as TRUE with probability
# 0.8 or FALSE with probability 0.2.
# NOTE(review): no seed is set in this chunk, so the split presumably changes
# between runs unless a seed is set elsewhere - confirm.
sampled_data<- sample(x = c(TRUE, FALSE),
size = nrow(anime_ratings),
replace = TRUE,
prob = c(0.8, 0.2))
# Rows flagged TRUE form the training set; the remaining rows form the test set.
training_data <- anime_ratings[sampled_data, ]
testing_data <- anime_ratings[!sampled_data, ]
#First approach: Without features.
recommendation_system <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommendation_system$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $weighted
## [1] TRUE
##
## $normalize
## [1] "center"
##
## $min_matching_items
## [1] 0
##
## $min_predictive_items
## [1] 0
library(caret)
library(tidyr)
# anime_ratings <- as(rating_data, "realRatingMatrix")
# Split-based evaluation scheme on the filtered rating matrix: 90% of the
# users go to training, and one rating per test user is handed to the
# recommender (given = 1).
e <- evaluationScheme(
  anime_ratings,
  method = "split",
  train = 0.9,
  given = 1,
  goodRating = 5,
  k = 10
)
getData(e, "train")
## 57 x 493 rating matrix of class 'realRatingMatrix' with 4388 ratings.
getData(e, "unknown")
## 7 x 493 rating matrix of class 'realRatingMatrix' with 685 ratings.
getData(e, "known")
## 7 x 493 rating matrix of class 'realRatingMatrix' with 7 ratings.
# Component recommenders of the hybrid model. The weights below are applied
# by position, so the order (POPULAR, RANDOM, RERECOMMEND) is unchanged; only
# the list labels were corrected so that each label names the algorithm it
# actually holds (they were previously swapped for POPULAR and RANDOM).
recommenders <- list(
  POPULAR = list(name = "POPULAR", param = NULL),
  RANDOM = list(name = "RANDOM", param = NULL),
  RERECOMMEND = list(name = "RERECOMMEND", param = NULL))
# for baseline uncomment this (all weight on the RANDOM component)
#weights <- c(0.0, 1.0, 0.0)
#hybrid recommender model
weights <- c(6.0, 1.0, 4.0)
# fit the hybrid recommender on the training users
r <- Recommender(data = getData(e, "train"), method = "HYBRID", parameter = list(recommenders = recommenders, weights = weights))
# predict ratings for the test users from their known ratings and score the
# predictions against the withheld ratings
p <- predict(object = r, newdata = getData(e, "known") , type = "ratings")
calcPredictionAccuracy(p, getData(e, "unknown"))
## RMSE MSE MAE
## 2.083102 4.339315 1.652481
head(calcPredictionAccuracy(p,getData(e, "unknown") , byUser = TRUE))
## RMSE MSE MAE
## 1 1.312018 1.721392 1.0887865
## 2 2.041245 4.166683 1.6682996
## 3 1.120353 1.255192 0.9330583
## 4 1.090787 1.189817 0.7178225
## 5 2.308843 5.330758 1.7427775
## 6 1.032041 1.065108 0.7722257
p <- predict(r, getData(e, "known"), type = "topNList", n = 10)
p
## Recommendations as 'topNList' with n = 10 for 7 users.
model_info <- getModel(r)
class(model_info)
## [1] "list"
model_info
## $recommenders
## $recommenders$RANDOM
## Recommender of type 'POPULAR' for 'realRatingMatrix'
## learned using 57 users.
##
## $recommenders$POPULAR
## Recommender of type 'RANDOM' for 'realRatingMatrix'
## learned using 57 users.
##
## $recommenders$RERECOMMEND
## Recommender of type 'RERECOMMEND' for 'realRatingMatrix'
## learned using 57 users.
##
##
## $weights
## [1] 0.54545455 0.09090909 0.36363636
Printing the recommendations for the first user.
user1 <- p@items[[1]] # recommendation for the first user
# Item labels (anime ids stored as text) of the recommended items.
movies_user1 <- p@itemLabels[user1]
# Translate the ids into titles with one vectorised lookup. This works for
# any number of recommendations (the loop was hard-coded to 10) and yields NA
# for an id missing from anime_data instead of stopping with an error.
movies_user2 <- as.character(
  anime_data$Name[match(movies_user1, as.character(anime_data$MAL_ID))]
)
movies_user2
## [1] "Koe no Katachi" "Ookami Kodomo no Ame to Yuki"
## [3] "Clannad: After Story" "Mushishi"
## [5] "Steins;Gate 0" "Hunter x Hunter (2011)"
## [7] "Baccano!" "Kimi no Na wa."
## [9] "Detroit Metal City" "Koukyoushihen Eureka Seven"
# Reload the full anime table: anime_data was narrowed to four columns
# earlier in the script, while the merge further down also needs `Ranked`.
# NOTE(review): the path is absolute and machine-specific - adjust it before
# running elsewhere.
csv_Anime <- read.csv(file = "/Users/abhibhagupta/Desktop/anime.csv")
anime_data <- csv_Anime
head(rating_data)
## user_id anime_id rating
## 1 0 430 9
## 2 0 1004 5
## 3 0 3010 7
## 4 0 570 7
## 5 0 2762 9
## 6 0 431 8
head(anime_data)
## MAL_ID Name Score
## 1 1 Cowboy Bebop 8.78
## 2 5 Cowboy Bebop: Tengoku no Tobira 8.39
## 3 6 Trigun 8.24
## 4 7 Witch Hunter Robin 7.27
## 5 8 Bouken Ou Beet 6.98
## 6 15 Eyeshield 21 7.95
## Genres English.name
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 2 Action, Drama, Mystery, Sci-Fi, Space Cowboy Bebop:The Movie
## 3 Action, Sci-Fi, Adventure, Comedy, Drama, Shounen Trigun
## 4 Action, Mystery, Police, Supernatural, Drama, Magic Witch Hunter Robin
## 5 Adventure, Fantasy, Shounen, Supernatural Beet the Vandel Buster
## 6 Action, Sports, Comedy, Shounen Unknown
## Japanese.name Type Episodes
## 1 カウボーイビバップ TV 26
## 2 カウボーイビバップ 天国の扉 Movie 1
## 3 トライガン TV 26
## 4 Witch Hunter ROBIN (ウイッチハンターロビン) TV 26
## 5 冒険王ビィト TV 52
## 6 アイシールド21 TV 145
## Aired Premiered
## 1 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2 Sep 1, 2001 Unknown
## 3 Apr 1, 1998 to Sep 30, 1998 Spring 1998
## 4 Jul 2, 2002 to Dec 24, 2002 Summer 2002
## 5 Sep 30, 2004 to Sep 29, 2005 Fall 2004
## 6 Apr 6, 2005 to Mar 19, 2008 Spring 2005
## Producers
## 1 Bandai Visual
## 2 Sunrise, Bandai Visual
## 3 Victor Entertainment
## 4 TV Tokyo, Bandai Visual, Dentsu, Victor Entertainment
## 5 TV Tokyo, Dentsu
## 6 TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Shueisha
## Licensors Studios Source Duration
## 1 Funimation, Bandai Entertainment Sunrise Original 24 min. per ep.
## 2 Sony Pictures Entertainment Bones Original 1 hr. 55 min.
## 3 Funimation, Geneon Entertainment USA Madhouse Manga 24 min. per ep.
## 4 Funimation, Bandai Entertainment Sunrise Original 25 min. per ep.
## 5 Unknown Toei Animation Manga 23 min. per ep.
## 6 VIZ Media, Sentai Filmworks Gallop Manga 23 min. per ep.
## Rating Ranked Popularity Members Favorites Watching
## 1 R - 17+ (violence & profanity) 28.0 39 1251960 61971 105808
## 2 R - 17+ (violence & profanity) 159.0 518 273145 1174 4143
## 3 PG-13 - Teens 13 or older 266.0 201 558913 12944 29113
## 4 PG-13 - Teens 13 or older 2481.0 1467 94683 587 4300
## 5 PG - Children 3710.0 4369 13224 18 642
## 6 PG-13 - Teens 13 or older 604.0 1003 148259 2066 13907
## Completed On.Hold Dropped Plan.to.Watch Score.10 Score.9 Score.8 Score.7
## 1 718161 71513 26678 329800 229170.0 182126.0 131625.0 62330.0
## 2 208333 1935 770 57964 30043.0 49201.0 49505.0 22632.0
## 3 343492 25465 13925 146918 50229.0 75651.0 86142.0 49432.0
## 4 46165 5121 5378 33719 2182.0 4806.0 10128.0 11618.0
## 5 7314 766 1108 3394 312.0 529.0 1242.0 1713.0
## 6 78349 14228 11573 30202 9226.0 14904.0 22811.0 16734.0
## Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 2 5805.0 1877.0 577.0 221.0 109.0 379.0
## 3 15376.0 5838.0 1965.0 664.0 316.0 533.0
## 4 5709.0 2920.0 1083.0 353.0 164.0 131.0
## 5 1068.0 634.0 265.0 83.0 50.0 27.0
## 6 6206.0 2621.0 795.0 336.0 140.0 151.0
#change column name so that we can merge dataframes to incorporate the important features ‘Score’ and ‘Ranked’
# Rename the anime id column so that it matches the key column of the anime
# table. The column is selected by name rather than by position, so the
# rename stays correct if the column order ever changes and is a no-op when
# the column has already been renamed.
names(rating_data)[names(rating_data) == "anime_id"] <- "MAL_ID"
head(rating_data)
## user_id MAL_ID rating
## 1 0 430 9
## 2 0 1004 5
## 3 0 3010 7
## 4 0 570 7
## 5 0 2762 9
## 6 0 431 8
# Join ratings and anime metadata on MAL_ID. all = TRUE keeps unmatched rows
# from both tables (filled with NA).
merged_df <- merge(
  x = rating_data,
  y = anime_data,
  by = "MAL_ID",
  all = TRUE
)
head(merged_df)
## MAL_ID user_id rating Name Score
## 1 1 57 9 Cowboy Bebop 8.78
## 2 1 6 6 Cowboy Bebop 8.78
## 3 1 22 9 Cowboy Bebop 8.78
## 4 1 62 10 Cowboy Bebop 8.78
## 5 1 19 8 Cowboy Bebop 8.78
## 6 1 14 9 Cowboy Bebop 8.78
## Genres English.name
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 2 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 3 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 4 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 5 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## 6 Action, Adventure, Comedy, Drama, Sci-Fi, Space Cowboy Bebop
## Japanese.name Type Episodes Aired Premiered
## 1 カウボーイビバップ TV 26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 2 カウボーイビバップ TV 26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 3 カウボーイビバップ TV 26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 4 カウボーイビバップ TV 26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 5 カウボーイビバップ TV 26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## 6 カウボーイビバップ TV 26 Apr 3, 1998 to Apr 24, 1999 Spring 1998
## Producers Licensors Studios Source
## 1 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 2 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 3 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 4 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 5 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## 6 Bandai Visual Funimation, Bandai Entertainment Sunrise Original
## Duration Rating Ranked Popularity Members
## 1 24 min. per ep. R - 17+ (violence & profanity) 28.0 39 1251960
## 2 24 min. per ep. R - 17+ (violence & profanity) 28.0 39 1251960
## 3 24 min. per ep. R - 17+ (violence & profanity) 28.0 39 1251960
## 4 24 min. per ep. R - 17+ (violence & profanity) 28.0 39 1251960
## 5 24 min. per ep. R - 17+ (violence & profanity) 28.0 39 1251960
## 6 24 min. per ep. R - 17+ (violence & profanity) 28.0 39 1251960
## Favorites Watching Completed On.Hold Dropped Plan.to.Watch Score.10 Score.9
## 1 61971 105808 718161 71513 26678 329800 229170.0 182126.0
## 2 61971 105808 718161 71513 26678 329800 229170.0 182126.0
## 3 61971 105808 718161 71513 26678 329800 229170.0 182126.0
## 4 61971 105808 718161 71513 26678 329800 229170.0 182126.0
## 5 61971 105808 718161 71513 26678 329800 229170.0 182126.0
## 6 61971 105808 718161 71513 26678 329800 229170.0 182126.0
## Score.8 Score.7 Score.6 Score.5 Score.4 Score.3 Score.2 Score.1
## 1 131625.0 62330.0 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 2 131625.0 62330.0 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 3 131625.0 62330.0 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 4 131625.0 62330.0 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 5 131625.0 62330.0 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
## 6 131625.0 62330.0 20688.0 8904.0 3184.0 1357.0 741.0 1580.0
# Keep the rating triplet plus the two feature columns.
subset_df <- subset(
  merged_df,
  select = c(user_id, MAL_ID, rating, Score, Ranked)
)
head(subset_df)
## user_id MAL_ID rating Score Ranked
## 1 57 1 9 8.78 28.0
## 2 6 1 6 8.78 28.0
## 3 22 1 9 8.78 28.0
## 4 62 1 10 8.78 28.0
## 5 19 1 8 8.78 28.0
## 6 14 1 9 8.78 28.0
df_norm <- subset_df
# Score and Ranked are converted to numeric and then standardised (centred
# and scaled to unit variance). Entries that are not numbers become NA in the
# conversion.
feature_cols <- c("Score", "Ranked")
df_norm[feature_cols] <- lapply(df_norm[feature_cols], as.numeric)
df_norm[feature_cols] <- scale(df_norm[feature_cols], center = TRUE, scale = TRUE)
#uniform data type
triplet_cols <- c("user_id", "MAL_ID", "rating")
df_norm[triplet_cols] <- lapply(df_norm[triplet_cols], as.double)
# remove Na values (this also drops ratings whose Score or Ranked is NA)
df_norm <- na.omit(df_norm)
#convert into sparse matrix
# NOTE(review): the data.frame -> realRatingMatrix coercion appears to read
# only the (user, item, rating) triplet, so Score and Ranked do not enter the
# rating matrix - confirm against the recommenderlab documentation. The
# triplet is passed explicitly here to make that visible.
mat <- as(df_norm[, triplet_cols], "realRatingMatrix")
# Split-based evaluation scheme on the merged-data rating matrix: 90% of the
# users go to training, and one rating per test user is handed to the
# recommender (given = 1).
e <- evaluationScheme(
  mat,
  method = "split",
  train = 0.9,
  given = 1,
  goodRating = 5,
  k = 10
)
getData(e, "train")
## 62 x 2868 rating matrix of class 'realRatingMatrix' with 8253 ratings.
getData(e, "unknown")
## 7 x 2868 rating matrix of class 'realRatingMatrix' with 1471 ratings.
getData(e, "known")
## 7 x 2868 rating matrix of class 'realRatingMatrix' with 7 ratings.
# Component recommenders of the hybrid model. The weights below are applied
# by position, so the order (POPULAR, RANDOM, RERECOMMEND) is unchanged; only
# the list labels were corrected so that each label names the algorithm it
# actually holds (they were previously swapped for POPULAR and RANDOM).
recommenders <- list(
  POPULAR = list(name = "POPULAR", param = NULL),
  RANDOM = list(name = "RANDOM", param = NULL),
  RERECOMMEND = list(name = "RERECOMMEND", param = NULL))
# for baseline uncomment this (all weight on the RANDOM component)
#weights <- c(0.0, 1.0, 0.0)
#hybrid recommender model
weights <- c(6.0, 1.0, 4.0)
# fit the hybrid recommender on the training users
r <- Recommender(data = getData(e, "train"), method = "HYBRID", parameter = list(recommenders = recommenders, weights = weights))
# predict ratings for the test users from their known ratings and score the
# predictions against the withheld ratings
p <- predict(object = r, newdata = getData(e, "known") , type = "ratings")
calcPredictionAccuracy(p, getData(e, "unknown"))
## RMSE MSE MAE
## 2.475789 6.129530 2.022432
head(calcPredictionAccuracy(p,getData(e, "unknown") , byUser = TRUE))
## RMSE MSE MAE
## 4 1.499396 2.248190 1.137540
## 15 2.367594 5.605499 2.089749
## 17 2.370959 5.621448 1.841630
## 20 3.198636 10.231273 2.933034
## 28 2.574678 6.628969 2.129784
## 68 2.614120 6.833624 1.560343
p <- predict(r, getData(e, "known"), type = "topNList", n = 10)
p
## Recommendations as 'topNList' with n = 10 for 7 users.
user1 <- p@items[[1]] # recommendation for the first user
# Item labels (anime ids stored as text) of the recommended items.
movies_user1 <- p@itemLabels[user1]
# Translate the ids into titles with one vectorised lookup. This works for
# any number of recommendations (the loop was hard-coded to 10) and yields NA
# for an id missing from anime_data instead of stopping with an error.
movies_user2 <- as.character(
  anime_data$Name[match(movies_user1, as.character(anime_data$MAL_ID))]
)
movies_user2
## [1] "Heartcatch Precure!"
## [2] "Bakumatsu Rock"
## [3] "Watashi no Coffee Samurai: Jihanki-teki na Kareshi"
## [4] "Owarimonogatari 2nd Season"
## [5] "Liz to Aoi Tori"
## [6] "Kuroko no Basket NG-shuu"
## [7] "Tekken: Blood Vengeance"
## [8] "Demi-chan wa Kataritai: Demi-chan no Natsuyasumi"
## [9] "Kishin Houkou Demonbane (TV)"
## [10] "Magi: Sinbad no Bouken"
# Topic modelling
# Next, we look for the dominant topics in the synopses of the anime that were recommended to this user.
library(dplyr)
# Wrap the recommended titles in a one-column data frame named "Name" so they
# can be joined against the synopsis table.
rec_anime <- setNames(as.data.frame(movies_user2), "Name")
# Load the synopsis table and preview its first rows.
synopsis_data <- read.csv(
  "/Users/abhibhagupta/Desktop/grad_coursework/sem2/DataMining/data/anime_with_synopsis.csv"
)
head(synopsis_data)
## MAL_ID Name Score
## 1 1 Cowboy Bebop 8.78
## 2 5 Cowboy Bebop: Tengoku no Tobira 8.39
## 3 6 Trigun 8.24
## 4 7 Witch Hunter Robin 7.27
## 5 8 Bouken Ou Beet 6.98
## 6 15 Eyeshield 21 7.95
## Genres
## 1 Action, Adventure, Comedy, Drama, Sci-Fi, Space
## 2 Action, Drama, Mystery, Sci-Fi, Space
## 3 Action, Sci-Fi, Adventure, Comedy, Drama, Shounen
## 4 Action, Mystery, Police, Supernatural, Drama, Magic
## 5 Adventure, Fantasy, Shounen, Supernatural
## 6 Action, Sports, Comedy, Shounen
## sypnopsis
## 1 In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member's dark and mysterious past little by little. Well-balanced with high density action and light-hearted comedy, Cowboy Bebop is a space Western classic and an homage to the smooth and improvised music it is named after.
## 2 other day, another bounty—such is the life of the often unlucky crew of the Bebop. However, this routine is interrupted when Faye, who is chasing a fairly worthless target on Mars, witnesses an oil tanker suddenly explode, causing mass hysteria. As casualties mount due to a strange disease spreading through the smoke from the blast, a whopping three hundred million woolong price is placed on the head of the supposed perpetrator. With lives at stake and a solution to their money problems in sight, the Bebop crew springs into action. Spike, Jet, Faye, and Edward, followed closely by Ein, split up to pursue different leads across Alba City. Through their individual investigations, they discover a cover-up scheme involving a pharmaceutical company, revealing a plot that reaches much further than the ragtag team of bounty hunters could have realized.
## 3 Vash the Stampede is the man with a $$60,000,000,000 bounty on his head. The reason: he's a merciless villain who lays waste to all those that oppose him and flattens entire cities for fun, garnering him the title "The Humanoid Typhoon." He leaves a trail of death and destruction wherever he goes, and anyone can count themselves dead if they so much as make eye contact—or so the rumors say. In actuality, Vash is a huge softie who claims to have never taken a life and avoids violence at all costs. With his crazy doughnut obsession and buffoonish attitude in tow, Vash traverses the wasteland of the planet Gunsmoke, all the while followed by two insurance agents, Meryl Stryfe and Milly Thompson, who attempt to minimize his impact on the public. But soon, their misadventures evolve into life-or-death situations as a group of legendary assassins are summoned to bring about suffering to the trio. Vash's agonizing past will be unraveled and his morality and principles pushed to the breaking point.
## 4 ches are individuals with special powers like ESP, telekinesis, mind control, etc. Robin, a 15-year-old craft user, arrives from Italy to Japan to work for an organization named STN Japan Division (STN-J) as a replacement for one of STN-J's witch hunters who was recently killed. Unlike other divisions of STN, STN-J tries to capture the witches alive in order to learn why and how they became witches in the first place. (Source: ANN)
## 5 It is the dark century and the people are suffering under the rule of the devil, Vandel, who is able to manipulate monsters. The Vandel Busters are a group of people who hunt these devils, and among them, the Zenon Squad is known to be the strongest busters on the continent. A young boy, Beet, dreams of joining the Zenon Squad. However, one day, as a result of Beet's fault, the Zenon squad was defeated by the devil, Beltose. The five dying busters sacrificed their life power into their five weapons, Saiga. After giving their weapons to Beet, they passed away. Years have passed since then and the young Vandel Buster, Beet, begins his adventure to carry out the Zenon Squad's will to put an end to the dark century.
## 6 Sena is like any other shy kid starting high school; he's just trying to survive. Constantly bullied, he's accustomed to running away. Surviving high school is about to become a lot more difficult after Hiruma, captain of the school's American football team, witnesses Sena's incredible agility and speed during an escape from some bullies. Hiruma schemes to make Sena the running back of his school team, The Devil Bats, hoping that it will turn around the squad's fortunes from being the laughingstock of Japan's high school leagues, to title contender. To protect his precious star player from rivaling recruiters, he enlists Sena as "team secretary," giving him a visored helmet and the nickname "Eyeshield 21" to hide his identity. The Devilbats will look to make their way to the Christmas Bowl, an annual tournament attended by the best football teams in Japan, with "Eyeshield 21" leading the way. Will they be able to win the Christmas Bowl? Will Sena be able to transform from a timid, undersized freshman to an all-star player? Put on your pads and helmet to find out!
# Left outer join on the title: every recommended anime is kept, and the
# synopsis columns are filled in wherever a matching Name exists.
left_join_df <- merge(
  x = rec_anime,
  y = synopsis_data,
  by = "Name",
  all.x = TRUE
)
head(left_join_df, n = 6L)
## Name MAL_ID Score
## 1 Bakumatsu Rock 23037 6.05
## 2 Demi-chan wa Kataritai: Demi-chan no Natsuyasumi 35823 7.53
## 3 Heartcatch Precure! 7645 7.79
## 4 Kishin Houkou Demonbane (TV) 1067 6.58
## 5 Kuroko no Basket NG-shuu 15487 7.69
## 6 Liz to Aoi Tori 35677 8.22
## Genres
## 1 Action, Music, Comedy, Historical, Shoujo
## 2 Comedy, Vampire, Fantasy, School, Seinen
## 3 Action, Slice of Life, Comedy, Magic, Fantasy, School, Shoujo
## 4 Action, Harem, Magic, Romance, Ecchi, Mecha
## 5 Comedy, School, Shounen, Sports
## 6 Drama, Music, School
## sypnopsis
## 1 ouma Sakamoto wants everyone to know about his passion for rock 'n' roll, so he roams around town with his electric guitar willing to show anyone he encounters that he's just as skilled as the famous Shinsengumi stars they admire. Unfortunately, Japan doesn't allow anything other than that group's Heaven's Songs, for writing or performing different types of music is forbidden and can lead to harsh consequences. Agitated by these strict rules and brainwashing, Ryouma does everything he can to show people that the music he loves will bring them the freedom they deserve. Along with his bandmates Shinsaku Takasugi and Kogoru Katsura, Ryouma works hard to find places for his rock 'n' roll group to perform. Refusing to back down until their music is accepted in Japan, the trio begin to realize that there's more to their passion than they had thought.
## 2 During summer break, Tetsuo asks Sakie to join him on patrol duty for the local summer festival, which both the demis and Himari and her friends go to. As each group decides to check out a nearby shrine, they get spooked by some strange occurrences, which turn out to be each other. As the next term begins, Tetsuo converses with an invisible woman named Matsuri. (Source: Crunchyroll)
## 3 Young flower enthusiast Tsubomi Hanasaki is often modest and quiet. But with her family moving to a new town, she aims to reinvent her image at her new school as someone more confident and outgoing. On moving day, she dreams of a mysterious tree in the sky guarded by a warrior named "Cure Moonlight." Tsubomi quickly learns that this was no ordinary dream when she encounters two mysterious fairies—Chypre and Coffret—who are being hunted down by a strange woman. When the woman summons a giant monster to attack the city, Tsubomi finds herself transforming into a warrior to fight the enemy! Taking on the alias "Cure Blossom," Tsubomi learns that the woman is part of a villainous group that aims to turn the world into a lifeless desert, with her new duty being to stop it from happening. As Tsubomi continues to battle more monsters and uncover the secrets behind Cure Moonlight, will she find the confidence needed to overcome her timid nature?
## 4 Kurou Daijuuji is a poor detective living in Arkham City. One day, he was requested by Ruri Hado of Hado Financial Group, to search for a magic book. While he initially refused, Ruri offered him a large sum of money upon completion of her request, in which bribed Kuro to accept. As Kurou searches for the book, he unexpectedly runs into Al, a pretty girl that is actually a powerful grimoire. They forge a contract with each other, bestowing Kuro with powerful magic. Soon afterwards, Al also activates Demonbane, a deus machina owned by the Hado Financial Group, to combat the mechanical menace from the Black Lodge. With this, the war between the Hado Financial Group and the Black Lodge begins.... (Source: ANN)
## 5 mated bloopers, based on the extra section of the original manga series, included with the BD/DVD series for Kuroko no Basket (both limited and normal editions).
## 6 z's days of solitude come to an end when she meets a blue bird in the form of a young girl. Although their relationship blossoms, Liz must make a heart-wrenching decision in order to truly realize her love for Blue Bird. High school seniors and close friends Mizore Yoroizuka and Nozomi Kasaki are tasked to play the lead instruments in the third movement of Liz and the Blue Bird, a concert band piece inspired by this fairy tale. The introverted and reserved Mizore plays the oboe, representing the kind and gentle Liz. Meanwhile, the radiant and popular Nozomi plays the flute, portraying the cheerful and energetic Blue Bird. However, as they rehearse, the distance between Mizore and Nozomi seems to grow. Their disjointed duet disappoints the band, and with graduation on the horizon, uncertainty about the future spurs complicated emotions. With little time to improve as their performance draws near, they desperately attempt to connect with their respective characters. But when Mizore and Nozomi consider the story from a brand-new perspective, will the girls find the strength to face harsh realities? A spin-off film adaptation of the Hibike Euphonium! series, Liz to Aoi Tori dances between the parallels of a charming fairy tale, a moving musical piece, and a delicate high school friendship.
#install.packages("tidytext")
library(tidyverse) # general utility & workflow functions
library(tidytext) # tidy implimentation of NLP methods
library(topicmodels) # for LDA topic modelling
library(tm) # general text mining functions, making document term matrixes
library(SnowballC) #
library(stringi)
#' Fit an LDA topic model and report the top terms of every topic
#'
#' Builds a document-term matrix from `input_text`, drops documents that
#' contain no terms, fits an LDA model (fixed seed 1234) and keeps the ten
#' highest-beta terms of each topic (ties are kept).
#'
#' @param input_text Character vector (e.g. a data frame column) with one
#'   document per element.
#' @param plot If `TRUE` (the default) return a faceted bar chart of the top
#'   terms; otherwise return the top terms as a table.
#' @param number_of_topics Number of LDA topics to fit (4 by default).
#' @return A ggplot object when `plot` is `TRUE`; otherwise the tidy table of
#'   top terms (columns `topic`, `term`, `beta`) sorted by topic and
#'   descending beta.
top_terms_by_topic_LDA <- function(input_text,
                                   plot = TRUE,
                                   number_of_topics = 4) {
  # tm works on corpus objects; the local name deliberately does not mask
  # the Corpus() constructor.
  text_corpus <- Corpus(VectorSource(input_text))
  doc_term_matrix <- DocumentTermMatrix(text_corpus)

  # LDA() errors on rows without any term, so keep only the documents that
  # actually appear in the sparse matrix.
  non_empty_docs <- unique(doc_term_matrix$i)
  if (length(non_empty_docs) == 0L) {
    stop("`input_text` contains no terms to model.", call. = FALSE)
  }
  doc_term_matrix <- doc_term_matrix[non_empty_docs, ]

  # Fit the model and get the per-topic word probabilities in tidy form.
  lda_fit <- LDA(
    doc_term_matrix,
    k = number_of_topics,
    control = list(seed = 1234)
  )
  topic_terms <- tidy(lda_fit, matrix = "beta")

  # Ten most informative terms per topic, most informative first.
  top_terms <- topic_terms %>%
    group_by(topic) %>%
    top_n(10, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)

  if (!isTRUE(plot)) {
    return(top_terms)
  }

  # One horizontal bar chart per topic, bars ordered by beta.
  top_terms %>%
    mutate(term = reorder(term, beta)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    labs(x = NULL, y = "Beta") +
    coord_flip()
}
# Build a document-term matrix from the synopses of the recommended anime.
# Punctuation is stripped while tokenising so that tokens such as "however,"
# or "(source:" can actually match the stop-word lists below.
reviewsCorpus <- Corpus(VectorSource(left_join_df$sypnopsis))
reviewsDTM <- DocumentTermMatrix(
  reviewsCorpus,
  control = list(removePunctuation = TRUE)
)
# convert the document term matrix to a tidy table (one row per document-term)
reviewsDTM_tidy <- tidy(reviewsDTM)
# Custom stop words for anime synopses: the credit boilerplate seen in the
# data, e.g. "(Source: ANN)" and "(Source: Crunchyroll)", says nothing about
# the content of a show.
custom_stop_words <- tibble(word = c("source", "ann", "crunchyroll"))
# remove stopwords
reviewsDTM_tidy_cleaned <- reviewsDTM_tidy %>%
  anti_join(stop_words, by = c("term" = "word")) %>% # English stop words
  anti_join(custom_stop_words, by = c("term" = "word")) # custom stop words
# Rebuild each cleaned document so that every word shows up the correct number
# of times. Words are joined with spaces: toString() would join them with ", "
# and the commas would end up glued to the tokens when the text is tokenised
# again inside top_terms_by_topic_LDA().
cleaned_documents <- reviewsDTM_tidy_cleaned %>%
  group_by(document) %>%
  mutate(terms = paste(rep(term, count), collapse = " ")) %>%
  ungroup() %>%
  select(document, terms) %>%
  distinct()
library(topicmodels)
top_terms_by_topic_LDA(cleaned_documents$terms, number_of_topics = 10)
#install.packages("wordcloud")
library(wordcloud)
#install.packages("RColorBrewer")
library(RColorBrewer)
#install.packages("wordcloud2")
library(wordcloud2)
# Corpus of the synopses of the recommended anime, used for the word cloud.
docs <- Corpus(VectorSource(left_join_df$sypnopsis))
docs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
# Clean the text. Lower-casing and stop-word removal run BEFORE punctuation is
# stripped: the English stop-word list contains contractions such as "don't",
# which would no longer match once the apostrophe is gone ("dont").
docs <- docs %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
# Term frequencies over the whole corpus, most frequent first.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
library(tm)
library(tmap)
#install.packages("tmaptools")
#install.packages("leafem")
library(wordcloud)
set.seed(1234) # for reproducibility
# Draw the word cloud: only words seen at least 10 times, at most 100 words,
# most frequent words placed first.
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 10,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  scale = c(2, 0.01)
)